/** * */ package outputter.evaluation; import java.io.PrintWriter; import java.io.StringWriter; import java.io.File; import java.sql.DriverManager; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.sql.Connection; import java.util.ArrayList; import java.util.Enumeration; import java.util.HashSet; import java.util.Hashtable; import java.util.Iterator; import org.apache.log4j.Logger; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.semanticweb.owlapi.model.OWLClass; import org.semanticweb.owlapi.model.OWLOntologyCreationException; import outputter.ApplicationUtilities; import outputter.XML2EQ; import outputter.knowledge.Dictionary; import outputter.knowledge.ELKReasoner; import owlaccessor.OWLAccessorImpl; /** * @author Hariharan * Used the existing EQPerformance and enhanced it to work with upgraded algorithm * */ @SuppressWarnings("unused") public class EQPerformanceEvaluation { private static final Logger LOGGER = Logger.getLogger(EQPerformanceEvaluation.class); private Connection conn; private String testtable; private String answertable; private String prtablefields; private String prtableEQs; private String prtablestates; private boolean printfields = false; private boolean printEQs = false; private boolean printTranslations = false; private ArrayList<ArrayList<Hashtable<String,String>>> astates; private ArrayList<ArrayList<Hashtable<String,String>>> tstates; private Hashtable<String,Hashtable<String,Float>> substringcache; private Hashtable<String,Hashtable<String,String>> equivalencecache; private Hashtable<String,String> existscache = new Hashtable<String,String>();// holds whether a id exists in ontology private Hashtable<String,Hashtable<String,Hashtable<String,String>>> Fieldgsnotontology = new Hashtable<String,Hashtable<String,Hashtable<String,String>>>(); // Stores fieldid<stateid,<original 
string,modified string>> static String relation ="inheres_in|adjacent_to|distal_to|OBO_REL_part_of|part of|inheres in|adjacent to|distal to|PHENOSCAPE_complement_of|complement of|and|some|bearer_of|anterior_to|anteriorly_connected_to|attaches_to|extends_from|connected_to|decreased_in_magnitude_relative_to|deep_to|develops_from|distal_to|distally_connected_to|dorsal_to|encloses|extends_to|has_cross_section|has_muscle_insertion|has_muscle_origin|has_part|in_anterior_side_of|in_distal_side_of|in_lateral_side_of|in_left_side_of|in_median_plane_of|in_posterior_side_of|in_proximal_side_of|in_right_side_of|increased_in_magnitude_relative_to|located_in|overlaps|part_of|passes_through|posterior_to|posteriorly_connected_to|proximal_to|proximally connected to|similar_in_magnitude_relative_to|surrounded by|surrounds|ventral_to|vicinity_of|serves_as_attachment_site_for|inheres_in|not"; static String relationid = relation+ "|BFO_0000050|BFO_0000052|BFO_0000053|BFO:0000053|RO:0002220|BSPO:0000096|UBERON:anteriorly_connected_to|UBERON:attaches_to|PHENOSCAPE:extends_from|RO:0002150|PATO:decreased_in_magnitude_relative_to|BSPO:0000107|RO:0002202|BSPO:0000097|UBERON:distally_connected_to|BSPO:0000098|UBERON:encloses|PHENOSCAPE:extends_to|PATO:has_cross_section|UBERON:has_muscle_insertion|UBERON:has_muscle_origin|BFO:0000051|BSPO:0000123|BSPO:0000125|UBERON:in_lateral_side_of|BSPO:0000120|UBERON:in_median_plane_of|BSPO:0000122|BSPO:0000124|BSPO:0000121|PATO:increased_in_magnitude_relative_to|OBO_REL:located_in|RO:0002131|BFO:0000050|BSPO:passes_through|BSPO:0000099|UBERON:posteriorly_connected_to|BSPO:0000100|UBERON:proximally_connected_to|PATO:similar_in_magnitude_relative_to|RO:0002219|RO:0002221|BSPO:0000102|BSPO:0000103|PHENOSCAPE:serves_as_attachment_site_for|PHENOSCAPE:complement_of"; private ArrayList<String> states = new ArrayList<String>(); private ELKReasoner elkentity,elkquality,elkspatial; //init Hashtable<String, String> counts; ArrayList<String> fields = new 
ArrayList<String>(); private boolean nowislabel=false; private String runsetting; private int partialcounts = 0; /** * @param runsetting * */ public EQPerformanceEvaluation(String database, String testtable, String answertable, String prtable, String runsetting) { this.testtable = testtable; this.answertable = answertable; this.prtableEQs = prtable+"_EQs"; this.prtablefields = prtable+"_fields_"+runsetting; this.prtablestates = prtable+"_states_"+runsetting; this.substringcache= new Hashtable<String, Hashtable<String, Float>>(); this.equivalencecache = new Hashtable<String,Hashtable<String,String>>(); this.runsetting = runsetting; String ontodir = ApplicationUtilities.getProperty("ontology.dir"); String uberon = ontodir+System.getProperty("file.separator")+ApplicationUtilities.getProperty("ontology.uberon")+".owl"; String bspo = ontodir+System.getProperty("file.separator")+ApplicationUtilities.getProperty("ontology.bspo")+".owl"; String pato = ontodir+System.getProperty("file.separator")+ApplicationUtilities.getProperty("ontology.pato")+".owl"; //long startTime = System.currentTimeMillis(); try { this.elkentity = new ELKReasoner(new File(XML2EQ.uberon==null? uberon : XML2EQ.uberon), false); this.elkquality = new ELKReasoner(new File(XML2EQ.pato==null? pato :XML2EQ.pato), false); this.elkspatial = new ELKReasoner(new File(XML2EQ.bspo==null? 
bspo: XML2EQ.bspo), false); } catch (OWLOntologyCreationException e1) { LOGGER.debug("", e1); System.out.print("can't load reasoner"); System.exit(1); } //long stopTime = System.currentTimeMillis(); //System.out.println("time spent on init elks was " + (stopTime - startTime)/60000f + " minutes."); initFields(); try{ if(conn == null){ Class.forName("com.mysql.jdbc.Driver"); conn = DriverManager.getConnection(ApplicationUtilities.getProperty("database.url")); Statement stmt = conn.createStatement(); //Holds the precision and recall values of each and every fields String sql ="create table if not exists "+prtablefields+" (id TIMESTAMP DEFAULT CURRENT_TIMESTAMP primary key, " + "entitylabelp float(4,2), entitylabelr float(4,2), " + "entityidp float(4,2), entityidr float(4,2), " + "qualitylabelp float(4,2), qualitylabelr float(4,2), " + "qualityidp float(4,2), qualityidr float(4,2), " + "RelatedEntityLabelp float(4,2), RelatedEntityLabelr float(4,2), " + "RelatedEntityidp float(4,2), RelatedEntityidr float(4,2)" + ")"; stmt.execute(sql); //Holds the table level EQ Precision and recall values stmt.execute("create table if not exists "+prtableEQs+" (id TIMESTAMP DEFAULT CURRENT_TIMESTAMP primary key, " + "runsetting varchar(100),"+ "exactp float(4,2), exactr float(4,2)" + ")"); stmt.execute(sql); //Holds the state level EQ Precision and recall values LOGGER.debug("create table if not exists "+prtablestates+" (stateid varchar(100) primary key, " + "stateprecision float(4,2), staterecall float(4,2)" + ")"); stmt.execute("create table if not exists "+prtablestates+" (stateid varchar(100) primary key, " + "stateprecision float(4,2), staterecall float(4,2)" + ")"); stmt.execute("delete from "+prtablestates); } }catch(Exception e){ LOGGER.error("", e); } //long stopTime2 = System.currentTimeMillis(); //System.out.println("time spent on init fields and db was " + (stopTime2 - stopTime)/60000f + " minutes."); } private void initFields() { //this.fields.add("stateid"); 
this.fields.add("entitylabel"); this.fields.add("entityid"); this.fields.add("qualitylabel"); this.fields.add("qualityid"); this.fields.add("relatedentitylabel"); this.fields.add("relatedentityid"); /*this.fields.add("entity"); this.fields.add("quality"); this.fields.add("relatedentity");*/ } /** * get precision and recall measurements * precision = #matched/#generated * recall = #matched/#inanswer */ public void evaluate(){ //tallying try{ //collect all unique state ids Statement stmt = conn.createStatement(); ResultSet rs = stmt.executeQuery("select distinct characterid, stateid from "+this.answertable); while(rs.next()){ String characterid = rs.getString("characterid"); String stateid = rs.getString("stateid"); if(characterid.length() > 0 && stateid.length() > 0){ states.add(stateid); } } stmt.close(); LOGGER.debug(System.currentTimeMillis()); //long startTime = System.currentTimeMillis(); readResultsfromDatabase(); //long stopTime = System.currentTimeMillis(); //System.out.println("time spent on reading results was " + (stopTime - startTime)/60000f + " minutes."); compareFields();//precision and recall for each of the fields //long stopTime2 = System.currentTimeMillis(); //System.out.println("time spent on comparing fields was " + (stopTime2 - stopTime)/60000f + " minutes."); readResultsfromDatabase(); //long stopTime3 = System.currentTimeMillis(); //System.out.println("time spent on reading results [again] was " + (stopTime3 - stopTime2)/60000f + " minutes."); compareEQs(); //for raw/labeled EQ statements //long stopTime4 = System.currentTimeMillis(); //System.out.println("time spent on comare EQs was " + (stopTime4 - stopTime3)/60000f + " minutes."); LOGGER.debug(System.currentTimeMillis()); //compareNonOntologizedGS(); }catch(Exception e){ LOGGER.error("", e); } //long startTime = System.currentTimeMillis(); /*System.out.println("reasoning results: "); System.out.println("elkentity: subclass:"); Set<String> e = elkentity.subclasscache.keySet(); for(String 
key: e){ System.out.println(key+"=>"+elkentity.subclasscache.get(key)); } System.out.println("elkquality: subclass:"); e = elkquality.subclasscache.keySet(); for(String key: e){ System.out.println(key+"=>"+elkquality.subclasscache.get(key)); } System.out.println("elkspatial: subclass:"); e = elkspatial.subclasscache.keySet(); for(String key: e){ System.out.println(key+"=>"+elkspatial.subclasscache.get(key)); } System.out.println("elkentity: partof:"); e = elkentity.partofcache.keySet(); for(String key: e){ System.out.println(key+"=>"+elkentity.partofcache.get(key)); } System.out.println("elkquality: partof:"); e = elkquality.partofcache.keySet(); for(String key: e){ System.out.println(key+"=>"+elkquality.partofcache.get(key)); } System.out.println("elkspatial: partof:"); e = elkspatial.partofcache.keySet(); for(String key: e){ System.out.println(key+"=>"+elkspatial.partofcache.get(key)); } System.out.println("end of reasoning results "); */ this.elkentity.dispose(); this.elkquality.dispose(); this.elkspatial.dispose(); //long stopTime = System.currentTimeMillis(); //System.out.println("time spent on disposing elks was " + (stopTime - startTime)/60000f + " minutes."); System.out.println("total partial counts="+this.partialcounts); } private void compareNonOntologizedGS() throws SQLException { //get the set of stateid's //fetch from gold standard and store stateid and gslabel //capture the index differences of the original label id and nullified ones //get the corresponding label and extract the matching labels.store them # separated //fetch from charparser output and store stateid and charparser string //using stateid get the string and unontologized entity string //if unontologized string is present then use it, else parse through the string and extract the entities separately // separate proposals by @, compare the eq with each one of the GS and arrive at mazimum score Set<String> fields = Fieldgsnotontology.keySet(); Hashtable<String,Hashtable<String,String>> 
field_states = new Hashtable<String,Hashtable<String,String>>(); Hashtable<String,String> gsstates; Hashtable<String,String> cpstates; Hashtable<String,Float> count = new Hashtable<String,Float>(); for(String field:fields) { gsstates = new Hashtable<String,String>(); cpstates = new Hashtable<String,String>(); getNonOntologizedvaluefromdb(field,Fieldgsnotontology.get(field),gsstates,cpstates,count); compareNonOntologized(field,gsstates,cpstates,count); } for(String field:fields) { System.out.println(count.get("gs"+field)); System.out.println(count.get("cp"+field)); System.out.println(count.get("gsmatched"+field)); } } private void compareNonOntologized(String field,Hashtable<String, String> gsstates, Hashtable<String, String> cpstates, Hashtable<String,Float> count) { float match_score =0;//contains the total matched score Set<String> stateids = gsstates.keySet();//Fetches the states of all the objects //ArrayList<String> gsEQs = new ArrayList<String>(); for(String stateid:stateids)// Each of he state id's { String gsstrings = gsstates.get(stateid); String[] gsEQs = gsstrings.split("<>");//Each individual GS array String cpstrings = cpstates.get(stateid); String[] cpEQs = cpstrings.split("<>");//Each individual EQ array Hashtable<String,Float> gseq_matchedscore = new Hashtable<String,Float>();//contains the matched score of all the EQ's of this state //prepopulate 0 in the counts /* for(String gsEQ:gsEQs) { gseq_matchedscore.put(gsEQ, 0.0f);// This will fail, if the GSEQ term is same all over }*/ String matched_gs_final =""; for( String cpEQ: cpEQs) // each of charparser EQ's { float max_score_gs_cp_eq =0;//contains maximum score of a GS that best matches this CP String matched_gs =""; for(String gsEQ:gsEQs) // Each of the GS eq's - here we are comparing all GS against a single CP EQ { if(matched_gs_final.contains(gsEQ)==false) { String[] cpEQPs = cpEQ.split("@,");//Each charparser proposals float max_score_gs =0; String GS_CP_final_match =""; for(String 
cpEQP:cpEQPs)//Each of the charparser proposals { String[] cpindividuallabels = cpEQP.split("#");//each individual objects/entities in each of the proposal String[] gsindividuallabels = gsEQ.split("#");// each of the GS individual objects String pairedCP =""; float final_maxscore =0; for(String gsindividal:gsindividuallabels)// each individual GS objects { float score =0,maxscore=0;; String matchcp ="";//holds the perfect match for this individual gs entity for(String cpeachentities:cpindividuallabels)// each individual cp objects { score=getLabelMatchScore(cpeachentities,gsindividal,field);//returns string to label matching if((maxscore<score)&&(pairedCP.contains(cpeachentities)==false))//need to check contains******** { maxscore = score; matchcp=cpeachentities; } } pairedCP+=matchcp+" "; final_maxscore+=maxscore;// contains the maximum score of this GS with this CP proposal } final_maxscore /=gsindividuallabels.length;//it will give a score in the range of 0 to 1 -( maximum score of this GS with this proposal) if(max_score_gs<final_maxscore) { max_score_gs = final_maxscore;//stores the final maxscore of this GSeq with this cp } } if(max_score_gs_cp_eq<max_score_gs) { max_score_gs_cp_eq=max_score_gs; matched_gs = gsEQ;// contains the best matching GS(of all) for this cp EQ } } } matched_gs_final+=matched_gs+" ";////contains the best EQproposal for this GS gseq_matchedscore.put(matched_gs, max_score_gs_cp_eq);//store the maximum score of this GS with this CP match_score +=max_score_gs_cp_eq; } } count.put("gsmatched"+field, match_score); } private void getNonOntologizedvaluefromdb(String field, Hashtable<String, Hashtable<String, String>> gsnotontology, Hashtable<String, String> gsstates, Hashtable<String, String> cpstates, Hashtable<String, Float> count) throws SQLException { Statement stmt = conn.createStatement(); String rootfield = field.replace("id", ""); String sql; ResultSet rs; Set<String> stateids = gsnotontology.keySet();//this returns the stateid's float 
countgseq=0,countcpeq=0;//counts the total number of EQ's i charparser and Gold standard seprately for(String stateid:stateids) { //fetching from GS first Hashtable<String,String> eqs = gsnotontology.get(stateid);//this returns the original id string, ontology nullified strings Set<String> eqkeys = eqs.keySet();//this returns each of eq's original id's String multipleeqs =""; for(String eqkey:eqkeys)//this fetches the original label of each of the original id's { Hashtable<String,Integer> nonexist = nonexistentid(eqkey,eqs.get(eqkey));//returns the difference label between original copy and nullified copy along with the index sql = "select "+rootfield+"label from "+this.answertable+" where stateid = '"+stateid+"' and "+rootfield+"id = '"+eqkey+"'"; rs = stmt.executeQuery(sql); String unontologized_label =""; if(rs.next()) { String label= rs.getString(rootfield+"label"); Set<String> ids = nonexist.keySet(); for(String id:ids)//if multiple temp ids are prsent, their labels are retrieved and stored as "#" separated value { unontologized_label+=extractLabel(unontologized_label, nonexist.get(id))+"#"; } unontologized_label= unontologized_label.replaceAll("(#)$", ""); } multipleeqs +=unontologized_label+"<>";//<> is the delimiter countgseq++;//counts the number of EQ's in Gold standard } multipleeqs = multipleeqs.replaceAll("(<>)$", ""); gsstates.put(stateid, multipleeqs); //Stores stateid,eq's(separated by <>) //fetching charparser information sql = "select "+rootfield+"id, "+rootfield+"label, unontologized"+ rootfield+" from "+this.testtable+" where stateid = '"+stateid+"'"; rs = stmt.executeQuery(sql); String final_string=""; while(rs.next()) { String label = rs.getString(rootfield+"label"); String unontologized = rs.getString("unontologized"+rootfield); String rootfieldid = rs.getString(rootfield+"id"); if((unontologized!=null)&&(unontologized.equals("")==false)) { final_string += unontologized +"<>"; } else { String[] ids = extractids(rootfieldid); ids = clean(ids); 
String[] labels = label.split("@,"); for(String ilabel:labels) { for(int i=0;i<ids.length;i++) { final_string +=extractLabel(label, i)+"#"; } final_string +="@,"; } final_string.replaceAll("(@,)$", ""); final_string +="<>"; } countcpeq++;//counts the number of EQ's in this state generated by charparser } final_string.replaceAll("(<>)$", ""); cpstates.put(stateid, final_string);// contains charaparser's stateid, final_string which is made of terms inside an id separated by # and each EP separated by @,and EQ separated by <> } count.put("gs"+field,countgseq); count.put("cp"+field, countcpeq); } private void readResultsfromDatabase() throws SQLException { //LOGGER.debug("inside read results"); ResultSet rs; Statement stmt = conn.createStatement(); //pair up answer and test states astates = new ArrayList<ArrayList<Hashtable<String,String>>>(); tstates = new ArrayList<ArrayList<Hashtable<String,String>>>(); Iterator<String> it = states.iterator(); while(it.hasNext()){ String stateid = it.next(); ArrayList<Hashtable<String, String>> astate = new ArrayList<Hashtable<String, String>>(); rs = stmt.executeQuery("select entitylabel, entityid, qualitylabel, qualityid, relatedentityLabel, relatedentityid from "+ this.answertable+" where stateid = '"+stateid+"'"); //All EQ's associated with a state are populated in hash table while(rs.next()){ Hashtable<String, String> EQ = new Hashtable<String, String> (); EQ.put("stateid", stateid); for(String field: this.fields){ String v = rs.getString(field); if(v==null){v="";} EQ.put(field, v); } astate.add(EQ); } //LOGGER.debug("added ["+stateid+"] from answer"); astates.add(astate);// the state eq statements are grouped here => gold standard ArrayList<Hashtable<String, String>> tstate = new ArrayList<Hashtable<String, String>>(); rs = stmt.executeQuery("select entitylabel, entityid, qualitylabel, qualityid, relatedentityLabel, relatedentityid from "+ this.testtable+" where stateid = '"+stateid+"'"); while(rs.next()){ Hashtable<String, 
String> EQ = new Hashtable<String, String> (); for(String field: this.fields){ String v = rs.getString(field); if(v==null){v="";} EQ.put(field, v); } tstate.add(EQ); } tstates.add(tstate);//generated states //LOGGER.debug("added ["+stateid+"] from test"); } } /**************************************************** Fields ************************************* * * @param tstate * @param astate */ private void compareFields() { LOGGER.debug("Inside compare fields"); if(counts == null){ counts = new Hashtable<String, String> (); //init for(String field : this.fields){ counts.put("inanswer"+field, ""+0); //Gold standard counts.put("generated"+field, ""+0); //our charparser output counts.put("matchedr"+field, ""+0); // total number of matches (matching score) between our algo and gold standard for recall calculation counts.put("matchedp"+field, ""+0); // total number of matches (matching score) between our algo and gold standard for precision calculation } //put totals in for(String field : this.fields){ getTotal(field);//counts the number of tokens in a field } } //collecting matched field by field for(String field : this.fields){ //long startTime = System.currentTimeMillis(); if(field.matches("entity|relatedentity|quality")) continue; float wcount = 0; float tcount = 0; float acount = 0; float tempcount =0; Hashtable<String,Hashtable<String,String>> GSnotontology = new Hashtable<String,Hashtable<String,String>>(); // Stores stateid,<original string,modified string> LOGGER.debug("Field========"+field); for(int i = 0; i < astates.size(); i++){ LOGGER.debug("state==="+i); ArrayList<String> avalues = new ArrayList<String>(); //There is no guarantee that each entity is mapped to same quality of the corresponding gold standard LOGGER.debug("Gold standard total EQ"+astates.get(i).size()); for(Hashtable<String, String> EQ :astates.get(i)){//gold standard String v = EQ.get(field).toLowerCase(); String stateid = EQ.get("stateid"); if(v!=null && v.length()>0){ String[] vs = 
v.split("\\s*,\\s*");// handling multiple values in each statement(field level) => multiple EQ's for gold standard String copy =""; for(String v1 : vs){ if(v1.length()>0) { if(field.contains("id")==true) { copy=v1; v1 = checkInOntology(v1,field);//nullifies the term, if it is not present in ontology if(v1.equals(copy)==false) { Hashtable<String,String> temp = new Hashtable<String,String>(); if(GSnotontology.get(stateid)!=null) { GSnotontology.get(stateid).put(copy,v1);//stores the copy of the nullified term along with the original string, stateid's } else { temp.put(copy,v1); GSnotontology.put(stateid,temp); } } avalues.add(v1);// collects the field values from all the EQ's }else { avalues.add(v1); } } } } } Hashtable<String,Hashtable<String,Float>> topvalues = new Hashtable<String,Hashtable<String,Float>>(); int eqcount=0; int gscount = avalues.size(); LOGGER.debug("charparser total EQ"+tstates.get(i).size()); for(Hashtable<String, String> EQ :tstates.get(i)){// reference=> CharParser generated output LOGGER.debug("EQ"+eqcount); ArrayList<String> tvalues = new ArrayList<String>(); String v = EQ.get(field).toLowerCase(); if(v!=null && v.length()>0){ String[] vs = v.split("\\s*(@?,)\\s*");// Added ? 
by Hong 1/16/2014 for(String v1 : vs){ if(v1.length()>0) { if(v1.contains("score")==true) //lower case because of the application of .toLowerCase before { tvalues.add((v1.substring(0, v1.indexOf("score")-1)).trim());//holds all the entity proposals of this EQ statement }else { tvalues.add(v1.trim()); } } } } LOGGER.debug("total number of proposals"+tvalues.size()); //evaluate cost associated with each proposal and GS, grouping them Hashtable<String,Float> groups = new Hashtable<String,Float>(); for(int j = 0; j<avalues.size(); j++){//gold standard fields float maxscore = 0; String entityproposal =""; ArrayList<Float> otherscores = new ArrayList<Float>(); //LOGGER.debug("gold standard GS "+j); for(int k = 0; k < tvalues.size(); k++){// reference entity proposals String v1 = tvalues.get(k).replace("\"", ""); String a = avalues.get(j).replace("\"", ""); //here, all the possible combinations should be scored and the best should be retained if(field.matches(".*(id)")==true) { tempcount= getIdMatchScore(v1.toLowerCase(),a.toLowerCase(),field);// uses exact match, partial match using elk to find the similarity score //LOGGER.debug(tempcount); if(maxscore<tempcount) { maxscore=tempcount;//score alone is important } otherscores.add(tempcount); } else if(field.matches(".*(label)")==true) { tempcount= getLabelMatchScore(v1.toLowerCase(),a.toLowerCase(),field);// Uses exact match and METEOR evaluation parameter to find the similarity of sentences //LOGGER.debug(tempcount); if(maxscore<tempcount) { maxscore=tempcount; } otherscores.add(tempcount); } } //apply penalty to the maximum score, since many alternatives were proposed //maxscore = penalty(maxscore,otherscores,otherscores.size()); //had no effect. 
LOGGER.debug("gold standard GS "+j+"Max score ==== "+maxscore); groups.put("GS"+j, maxscore);// stores the best of eq (from GS) *eqp1 } topvalues.put("EQ"+eqcount++, groups);//Stores each EQ's best entity proposals //group should contain the maximum combination of generated*gold standard } //long stopTime1 = System.currentTimeMillis(); //System.out.println("time spent on grouping was " + (stopTime1 - startTime)/60000f + " minutes."); //the below code calculates the matching score of the best EP and E(GS) mapping //from a matrix with rows being EPs, columns being GS, and cells holding scores //the code find the greatest value in a column and then sum up those values from all columns and use the sum as the matching score of the best EP to GS mapping //This logic does not identify which of the machine-generated EQ best match which GS EQ. //This logic measures the extent of the 'semantics' in GS EQ is covered by machine-generated EQ set collectively. //float sum = 0f; for(int g = 0; g <gscount; g++){ Set<String> EQS = topvalues.keySet(); float max = 0f; //find the greatest value for GS_g column for(String EQ:EQS)//each EQ's { Hashtable<String,Float> gsgroups = topvalues.get(EQ);//gets the group of each EQ containing GS->EQ matching scores if(max < gsgroups.get("GS"+g)){ max = gsgroups.get("GS"+g); } } LOGGER.debug("max====="+max); counts.put("matchedr"+field, ""+(Float.parseFloat(counts.get("matchedr"+field))+max)); //sum += max; } Set<String> EQS = topvalues.keySet(); for(String EQ:EQS)//each EQ's { float max = 0f; //find the greatest value for EQ row Hashtable<String,Float> gsgroups = topvalues.get(EQ);//gets the group of each EQ containing GS->EQ matching scores for(int g = 0; g <gscount; g++){ if(max < gsgroups.get("GS"+g)){ max = gsgroups.get("GS"+g); } } LOGGER.debug("max====="+max); counts.put("matchedp"+field, ""+(Float.parseFloat(counts.get("matchedp"+field))+max)); //sum += max; } //the code below is logically incorrect. 
It takes any first encountered non-zero value as the max and not checking other scores. Hong 1/13/2014 /*String matched =""; Set<String> EQS = topvalues.keySet(); String tempgroup=""; for(String EQ:EQS)//each EQ's { Hashtable<String,Float> gsgroups = topvalues.get(EQ);//gets the group of each EQ containing GS->EQ matching scores Set<String> groups = gsgroups.keySet(); float max=0; tempgroup=""; for(String group:groups) { if(matched.contains(group)==false) { if(max<gsgroups.get(group)) { max = gsgroups.get(group); tempgroup=group; } } } matched+=" "+tempgroup; LOGGER.debug("max====="+max); counts.put("matched"+field, ""+(Float.parseFloat(counts.get("matched"+field))+max)); }*/ //long stopTime2 = System.currentTimeMillis(); //System.out.println("time spent on scoring was " + (stopTime2 - stopTime1)/60000f + " minutes."); // if(tempgroup!="") // topvalues.remove(tempgroup);// to make sure that this group is not considered again as one entity proposal has been already considered } //long stopTime = System.currentTimeMillis(); //System.out.println("time spent on comparing "+field+" was " + (stopTime - startTime)/60000f + " minutes."); if(field.contains("id")==true) { Fieldgsnotontology.put(field, GSnotontology); } } //calculate and output P/R measurements String prstring = ""; String fieldstring = ""; for(String field : this.fields){ fieldstring += field+"p,"+field+"r,"; LOGGER.debug("\t\t\t\t matchedrfield======"+counts.get("matchedr"+field)); LOGGER.debug("\t\t\t\t matchedpfield======"+counts.get("matchedp"+field)); LOGGER.debug("\t\t\t\tgeneratedfield======"+counts.get("generated"+field)); LOGGER.debug("\t\t\t inanswerfield======"+counts.get("inanswer"+field)); float p = Float.parseFloat(counts.get("generated"+field))==0? 0 : Float.parseFloat(counts.get("matchedp"+field))/Float.parseFloat(counts.get("generated"+field)); float r = Float.parseFloat(counts.get("inanswer"+field)) ==0? 
0 : Float.parseFloat(counts.get("matchedr"+field))/Float.parseFloat(counts.get("inanswer"+field)); prstring += p+","+r+","; } prstring = prstring.replaceFirst(",$", ""); fieldstring = fieldstring.replaceFirst(",$", ""); insertInto(this.prtablefields, fieldstring, prstring); LOGGER.debug("End of compare fields"); } //If the id doesn't exists, it nullifies the id and return the string private String checkInOntology(String value, String field) { value = value.toUpperCase().trim(); if(existscache.get(value)!=null) return existscache.get(value); String[] ids = extractids(value); String valuecopy = value; Boolean exist=false; ELKReasoner tempelk=null; for(String id:ids) { exist=false; id=id.trim(); if(id.contains("BSPO")) { tempelk = this.elkspatial; } else if(field.contains("quality")) { tempelk = this.elkquality; }else { tempelk = this.elkentity; } exist=tempelk.CheckClassExistence(id); if(exist == false) { value=value.replaceFirst(id, "null"); } } existscache.put(valuecopy, value); return value.toLowerCase(); } private static String[] extractids(String value) { value=value.replaceAll("(\\(|\\))", ""); String[] temp = value.split("\\s"); String id =""; for(String t:temp) { if(t.matches("[A-Z]+[_:][0-9A-Z_-]+")==true) { id+=t+" "; } } id=id.trim(); return id.split(" "); } /* private boolean checkInOntology(String id) { String iri=""; if(id.startsWith(OWLAccessorImpl.temp)){ iri=Dictionary.provisionaliri+id.substring(id.indexOf(":")+1); }else{ iri=Dictionary.baseiri+id.replace(':', '_'); } if(allclasses.contains(iri.trim())) return true; else return false; } */ /* * * Calculates penalty for ID and label(proposals) * */ private float penalty(float maxscore, ArrayList<Float> otherscores, int totalsize) { //LOGGER.debug("inside penalty for EP"); //removing the max score from the list float meansquare =0; float standarddeviation =0; float finalscore=0,penalty =0; for(int i=0;i<otherscores.size();i++) { if(otherscores.get(i)==maxscore) { otherscores.remove(i); break; } } 
//calculating S.D for(int i=0;i<otherscores.size();i++) { meansquare+=(maxscore -otherscores.get(i))*(maxscore -otherscores.get(i)); } if(otherscores.size()>0) { standarddeviation = meansquare/otherscores.size(); standarddeviation=(float) Math.sqrt(standarddeviation); penalty = (float) (standarddeviation * Math.pow((double)(((float)totalsize-1)/(float)(totalsize)), 3)); } finalscore = maxscore - penalty; return finalscore; } private float getLabelMatchScore(String a, String v, String field) { float count=0; //TODO: remove some,partof, all if(a.length()>0 && v.length()>0 && (a.toLowerCase().equals(v.toLowerCase()))){ count +=1; } else { //Call meteor to get the score(closeness value of the two strings) count+=this.meteor(a, v); } return count; } /* * returns the matching score of entityId/quality ID * * @parameters a gold standard string * @parameeters v charparser algo generated string * * @return count holds the closeness score */ private float getIdMatchScore(String a, String v, String field) { //LOGGER.debug("inside getidmatch score"); float count=0; ELKReasoner elk =null; Hashtable<String,Float> substrings = new Hashtable<String, Float>();//holds substrings in candidate string along with the score Hashtable<String,String> equivalence = new Hashtable<String,String>();//hold matching substrings(candidate, reference) if(field.matches(".*(entity).*")==true) { elk = this.elkentity; } else { elk = this.elkquality; } if(a.length()>0 && v.length()>0) { //make a call to substring function, followed by replace substring function a= format(a); v= format(v); //The below cache is used to speed up the lookup process if(this.substringcache.get(a.trim()+","+v.trim())==null) { //getMatchingSubstrings(a,v,0,a.split(" ").length-1,substrings ,field,elk,equivalence); substring(a,v,substrings ,field,elk,equivalence); this.substringcache.put(a.trim()+","+v.trim(), substrings); this.equivalencecache.put(a.trim()+","+v.trim(), equivalence); } else { substrings = 
this.substringcache.get(a.trim()+","+v.trim()); equivalence = this.equivalencecache.get(a.trim()+","+v.trim()); } float avg = (a.split(" ").length + v.split(" ").length)/2; count = replaceSubString(a,v,substrings,equivalence);//count the number of LCS matches and other matches (other match count reduced to 1/2) to a, the answer. count = count/avg; // to reduce it to value of 0.0 - 1.0 } return count; } /** * * @param prtablefields2 */ private void insertInto(String tablename, String fieldstring, String prstring) { try{ Statement stmt = conn.createStatement(); LOGGER.debug("insert into "+tablename+"("+fieldstring+")"+" values ("+prstring+")"); LOGGER.debug(prstring); LOGGER.debug(fieldstring); stmt.execute("insert into "+tablename+"("+fieldstring+")"+" values ("+prstring+")"); }catch(Exception e){ LOGGER.error("", e); } } /** * count only the fields associated with a state statement * counts the number of entity/quality/relationship tokens in both gold standard and charparser output strings * * @param field */ private void getTotal(String field) { //LOGGER.debug("inside get total"); try{ Statement stmt = conn.createStatement(); //total for answers(Gold standard) int count = 0; ResultSet rs = stmt.executeQuery("select "+field+" from "+this.answertable+" where "+field+" is not null and length(trim("+field+"))>0 and length(stateid)>0"); while(rs.next()){ String t = rs.getString(1); if(t.length()>0) { t= format(t);//removes and|all|some, brackets() and replaces : with underscore "_" count++; } } counts.put("inanswer"+field, ""+(count++)); LOGGER.debug("inanswer"+field+ " "+(count)); //total for generated(our algorithm) count = 0; rs = stmt.executeQuery("select "+field+" from "+this.testtable+" where "+field+" is not null and length(trim("+field+"))>0 and length(stateid)>0"); while(rs.next()){ String t = rs.getString(1); if(t.length()>0) { t= format(t);//removes and|all|some, brackets() and replaces : with underscore "_" count++; } } counts.put("generated"+field, 
""+(count++));
            // count++ above is a post-increment: the stored total is the real count,
            // while the debug line below prints that count plus one.
            LOGGER.debug("generated"+field+ " "+(count));
        }catch(Exception e){
            LOGGER.error("", e);
        }
    }

//  private int getTokens(String t) {
//
//      String temp="";
//      Pattern p = Pattern.compile("((pato|bfo|uberon|bspo)_[\\d]+){1}");
//      Matcher m = p.matcher(t);
//
//      while(m.find()){
//          temp +=" "+ m.group(1).trim();
//      }
//
//      return temp.trim().split(" ").length;
//  }

    /**************************************************** EQs *************************************
     * Compares EQs as a whole.
     *
     * For every state i the generated EQs in tstates.get(i) are scored against the
     * answer-key EQs in astates.get(i). Per-state precision/recall are written to
     * the prtablestates table and the overall precision/recall to the prtableEQs table.
     *
     * NOTE(review): tstates is indexed with the astates index and
     * astates.get(i).get(0) is read for the state id, so both lists are assumed
     * to be aligned and every gold-standard state non-empty - confirm.
     *
     * @throws SQLException
     */
    private void compareEQs() throws SQLException {
        //raw
        LOGGER.debug("inside compare eq's");
        nowislabel=false;
        int totalgenerated = 0;//charparser
        int totalinanswer = 0;//gold standard
        float eqmatchscore =0;
        float statescorer =0;
        float totalscorer =0;
        float statescorep =0;
        float totalscorep =0;
        String prstring = "";
        String fieldstring = "";
        float stateprecision=0;
        float staterecall =0;

        for(int i = 0; i<astates.size(); i++){//all states from Gold standard
            //long startTime = System.currentTimeMillis();
            totalinanswer += astates.get(i).size();//Gives in number of EQ's in this state => gold standard
            totalgenerated += tstates.get(i).size();//Gives in number of EQ's in this state => our algorithm
            // "EQ"+n -> ("GS"+g -> score): one row per generated EQ, one column per gold-standard EQ
            Hashtable<String,Hashtable<String,Float>> eqgroups = new Hashtable<String,Hashtable<String,Float>>();
            int counter=0;
            statescorer=0;
            statescorep=0;
            int eqcount=0;
            LOGGER.debug("state"+i);
            for(Hashtable<String, String> tEQ : tstates.get(i)){
                LOGGER.debug("EQ==="+eqcount++);
                String entity = tEQ.get("entityid");//contains entity proposals separated by comma
                String relatedentitylabel = tEQ.get("relatedentityid");//ditto
                String quality = tEQ.get("qualityid");//ditto
                quality=quality.replaceAll("\\[.*\\]", "").trim();//removing anything inside bracket[]
                eqgroups.put("EQ"+counter++,matchAstates(entity, relatedentitylabel, quality, astates.get(i), ""));//
// EQ,(gs,scores)
            }
            //change the hash table to be GS -> EQ to maximize the score
            LOGGER.debug("sorting out EQ's");
            //the below code calculates the matching score of the best EP and E(GS) mapping
            //from a matrix with rows being EPs, columns being GS, and cells holding scores
            //the code find the greatest value in a column and then sum up those values from all columns and use the sum as the matching score of the best EP to GS mapping
            //This logic does not identify which of the machine-generated EQ best match which GS EQ.
            //This logic measures the extent of the 'semantics' in GS EQ is covered by machine-generated EQ set collectively.
            //float sum = 0f;
            // recall side: best score in every gold-standard column
            for(int g = 0; g <astates.get(i).size(); g++){
                Set<String> EQS = eqgroups.keySet();
                float max = 0f;
                //find the greatest value for GS_g column
                for(String EQ:EQS)//each EQ's
                {
                    Hashtable<String,Float> eqmatch = eqgroups.get(EQ);//gets the group of each EQ containing GS->EQ matching scores
                    LOGGER.debug("gs=== "+g+" match score"+eqmatch.get("GS"+g));
                    if(max < eqmatch.get("GS"+g)){
                        max = eqmatch.get("GS"+g);
                    }
                }
                statescorer += max;
            }

            // precision side: best score in every generated-EQ row
            Set<String> EQS = eqgroups.keySet();
            for(String EQ:EQS)//each EQ's
            {
                Hashtable<String,Float> eqmatch = eqgroups.get(EQ);//gets the group of each EQ containing GS->EQ matching scores
                float max = 0f;
                for(int g = 0; g <astates.get(i).size(); g++){
                    //find the greatest value for the EQ row
                    LOGGER.debug("gs=== "+g+" match score"+eqmatch.get("GS"+g));
                    if(max < eqmatch.get("GS"+g)){
                        max = eqmatch.get("GS"+g);
                    }
                }
                statescorep += max;
            }

            //this logic is incorrect. It doesn't get the global maximum.
            /*Set<String> keys = eqgroups.keySet();//gives all the eq's
            String gsmatched="";
            for(String key:keys)
            {
                Hashtable<String,Float> eqmatch = eqgroups.get(key);//reading each of the EQ groups(gs,score)
                Set<String> keys2 = eqmatch.keySet();
                float maxscore=0;
                String matched="";
                LOGGER.debug(key);
                for(String key2:keys2)
                {
                    LOGGER.debug("gs=== "+key2+" match score"+eqmatch.get(key2));
                    if((maxscore<eqmatch.get(key2))&&(gsmatched.contains(key2)==false)) //reading each of the eq's and finding the best match
                    {
                        maxscore=eqmatch.get(key2);
                        matched=key2;
                    }
                }
                gsmatched+=" "+matched;//used to track the gold standards already matched
                gsmatched=gsmatched.trim();
                statescore+=maxscore;
                //long stopTime = System.currentTimeMillis();
                //System.out.println("time spent on EQ"+states.get(i)+" was " + (stopTime - startTime)/60000f + " minutes.");
            }*/
            LOGGER.debug("State score"+i+" "+statescorer);
            stateprecision = tstates.get(i).size()==0? 0 :(float)statescorep/tstates.get(i).size();
            staterecall = astates.get(i).size()==0? 0 :(float)statescorer/astates.get(i).size();
            fieldstring = "stateid,stateprecision,staterecall";
            prstring ="'"+astates.get(i).get(0).get("stateid")+"',"+stateprecision+","+staterecall;
            this.insertInto(this.prtablestates, fieldstring, prstring);
            totalscorer+=statescorer;
            totalscorep+=statescorep;
        }

        // overall precision/recall across all states
        fieldstring = "runsetting, exactp, exactr";
        float precision = totalgenerated==0? 0 : (float)totalscorep/totalgenerated;
        float recall = totalinanswer==0?
0 : (float)totalscorer/totalinanswer;
        prstring = "'"+this.runsetting+"',"+ precision +","+ recall +"";
        this.insertInto(this.prtableEQs, fieldstring, prstring);
    }

    /**
     * Scores one generated EQ against every gold-standard EQ of a state.
     * Each of the three arguments may hold several proposals separated by "@,";
     * every entity/quality/related-entity combination is scored.
     *
     * NOTE(review): matchscore is declared outside the loop over aState, so the
     * penalised score of one gold-standard EQ is the starting maximum for the
     * next one - confirm this carry-over is intended.
     *
     * @param entity entity id proposals
     * @param relatedentity related entity id proposals
     * @param quality quality id proposals
     * @param aState gold-standard EQs of the state
     * @param suffix "label" or "", passed through to matchInState
     * @return table mapping "GS"+index of each gold-standard EQ to its penalised score
     */
    private Hashtable<String, Float> matchAstates(String entity, String relatedentity, String quality, ArrayList<Hashtable<String, String>> aState, String suffix) {
        LOGGER.debug("inside match a state");
        //one state may have N EQs
        float matchscore = 0;
        String entityproposals[] = entity.replace("\"","").split("(@,)");
        String qualityproposals[] = quality.replace("\"","").split("(@,)");
        String relatedentityproposals[] = relatedentity.replace("\"","").split("(@,)");
        Hashtable<String,Float> group = new Hashtable<String,Float>();
        //Gives score of all the proposals against all the gold standards
        for(int i = 0; i < aState.size(); i++){//Parsing through multiple EQ's of each state(gold standard)
            ArrayList<Float> otherscores = new ArrayList<Float>();
            for(int j=0;j<entityproposals.length;j++)//parsing through multiple proposals
            {
                for(int k=0;k<qualityproposals.length;k++)
                {
                    for(int p=0;p<relatedentityproposals.length;p++)
                    {
                        float epscore = matchInState(entityproposals[j], relatedentityproposals[p], qualityproposals[k], aState.get(i), suffix);
                        if(epscore > matchscore) {//max of all states as the character's matchsize
                            matchscore = epscore;
                            //Not breaking out until, we find more appropriate match by iterating through the EQ's for that particular stateid
                        }
                        otherscores.add(epscore);
                    }
                }
            }
            int totalsize= gettotalsize(entityproposals.length,qualityproposals.length,relatedentityproposals.length);
            matchscore= penalty(matchscore,otherscores,totalsize);
            group.put("GS"+i, matchscore);//holds
// the maximum closeness score for all the GS statement with this EQ
            LOGGER.debug("GS"+i+"matchscore"+matchscore);
        }
        return group;
    }

    /**
     * Returns the penalty weight base used for a set of proposals: the sum of
     * the number of entity, quality and related-entity proposals.
     */
    private int gettotalsize(int entitysize, int qualitysize, int relatedentitysize) {
        /*
        int maxsize = entitysize;
        if(qualitysize>relatedentitysize)
        {
            if(maxsize<qualitysize)
                maxsize = qualitysize;
        }else
        {
            if(maxsize<relatedentitysize)
                maxsize = relatedentitysize;
        }*/
        return entitysize+qualitysize+relatedentitysize;
    }

    /**
     * Scores one entity/related-entity/quality proposal against one
     * gold-standard EQ.
     *
     * Anything from the text "Score" onwards is cut off each proposal first.
     * The three id match scores are summed and averaged over 3, or over 2 when
     * neither the proposal nor the gold standard has a related entity.
     *
     * @param entity entity id proposal
     * @param relatedentity related entity id proposal
     * @param quality quality id proposal
     * @param EQ gold-standard EQ; its "entityid", "relatedentityid" and
     *           "qualityid" entries are read
     * @param suffix not used by this method
     * @return averaged match score (approximately in 0..1)
     */
    private float matchInState(String entity, String relatedentity,String quality, Hashtable<String, String> EQ, String suffix) {
        //LOGGER.debug("inside match in state");
        float totalscore=0;

        // All three proposals are cleaned the same way. Previously only quality
        // tolerated a proposal that starts with "Score"; entity and related
        // entity ran substring(0,-1) and threw StringIndexOutOfBoundsException.
        entity = stripScoreSuffix(entity);
        relatedentity = stripScoreSuffix(relatedentity);
        quality = stripScoreSuffix(quality);

        totalscore+=getIdMatchScore(entity.toLowerCase(),EQ.get("entityid").toLowerCase(),"entity");
        totalscore+=getIdMatchScore(relatedentity.toLowerCase(),EQ.get("relatedentityid").toLowerCase(),"entity");
        totalscore+=getIdMatchScore(quality.toLowerCase(),EQ.get("qualityid").toLowerCase(),"quality");

        //Related entity, if present in both then only it should be taken into consideration
        if((relatedentity.toLowerCase().equals("")==true)&&(EQ.get("relatedentityid").toLowerCase().equals("")==true))
            return totalscore/2;
        else
            return totalscore/3;//approximation to 1
    }

    /**
     * Cuts the trailing "Score..." part off a proposal: returns the proposal
     * unchanged when it has no "Score", the empty string when it starts with
     * "Score", and otherwise the trimmed text that ends one character before it.
     */
    private static String stripScoreSuffix(String proposal) {
        int scoreindex = proposal.indexOf("Score");
        if(scoreindex<0)
            return proposal;
        if(scoreindex==0)
            return "";
        return proposal.substring(0,scoreindex-1).trim();
    }

    //-------------------------------------------------------
    /*
     *
     * populates the substring array
     *
     */
    public void
substring(String candidate,String reference,Hashtable<String,Float> substrings,String type,ELKReasoner elk,Hashtable<String,String> equivalence) {
        // Builds three candidate-token x reference-token tables. A cell is filled when
        // the two tokens match: match holds the length of the diagonal run ending there,
        // scores the summed score of that run, matchingstring the candidate tokens of it.
        // The elk argument is overwritten per candidate token below.
        long start = System.currentTimeMillis();
        long end;
        LOGGER.debug("Inside substring "+System.currentTimeMillis());
        int[][] match = new int[candidate.split(" ").length][reference.split(" ").length];
        float[][] scores = new float[candidate.split(" ").length][reference.split(" ").length];
        String[][] matchingstring = new String[candidate.split(" ").length][reference.split(" ").length];
        String c[] = candidate.split(" ");
        String r[] = reference.split(" ");

        for(int i=0;i<candidate.split(" ").length;i++)
        {
            //if(c[i].matches(".*(bspo|BSPO|UBERON|uberon|BFO|bfo|RO|ro).*")==true)
            //entities could come from CL, GO or other imported ontologies
            // reasoner choice: PATO tokens -> quality, BSPO tokens -> spatial, anything else -> entity
            if(!c[i].matches(".*[pP][Aa][Tt][Oo].*"))
            {
                if(c[i].matches(".*[Bb][Ss][Pp][Oo].*")==false)
                    elk = this.elkentity;
                else
                    elk = this.elkspatial;
            }
            else
            {
                elk = this.elkquality;
            }
            for(int j=0;j<reference.split(" ").length;j++)
            {
                float score =0;
                //add code to ignore provisionalid's
                // identical tokens score 1; subclass or part-of in either direction scores
                // 0.75 (part-of is skipped for PATO tokens); equivalent classes score 1
                if(c[i].equals(r[j]))
                {
                    score=1;
                }
                else if(elk.isSubClassOf("http://purl.obolibrary.org/obo/"+c[i].toUpperCase(),"http://purl.obolibrary.org/obo/"+r[j].toUpperCase())==true || elk.isSubClassOf("http://purl.obolibrary.org/obo/"+r[j].toUpperCase(),"http://purl.obolibrary.org/obo/"+c[i].toUpperCase())==true)
                {
                    score=(float) 0.75;
                    partialcounts++;
                }
                else if((c[i].matches(".*[pP][Aa][Tt][Oo].*")==false)&& ((elk.isPartOf("http://purl.obolibrary.org/obo/"+c[i].toUpperCase(),"http://purl.obolibrary.org/obo/"+r[j].toUpperCase())==true || elk.isPartOf("http://purl.obolibrary.org/obo/"+r[j].toUpperCase(),"http://purl.obolibrary.org/obo/"+c[i].toUpperCase())==true)))
                {
                    score=(float) 0.75;
                    partialcounts++;
                }
                else if(elk.isEquivalent(c[i], r[j]) == true)
                {
                    score=(float) 1.0;
                    partialcounts++;
                }

                if(score>0)
                {
                    if(i==0||j==0)
                    {
                        // first row/column: a run can only start here
                        match[i][j] =1;
                        scores[i][j] = score;
                        matchingstring[i][j] = c[i];
                    }
                    else
                    {
                        // extend the run that ends on the upper-left neighbour
                        match[i][j] = match[i-1][j-1]+1;
scores[i][j] = scores[i-1][j-1]+score;
                        matchingstring[i][j] = matchingstring[i-1][j-1]+" "+c[i];
                    }
                }
            }
        }
        end = System.currentTimeMillis();
        LOGGER.debug("end of substring "+System.currentTimeMillis());
        LOGGER.debug("Difference"+ (end-start));
        getNonOverlappingSubstrings(match,matchingstring,scores,candidate.split(" "),reference.split(" "),substrings, equivalence);
    }

    /*
     * Gets all the non overlapping substrings.
     *
     * Repeatedly takes the longest remaining diagonal run in match, records the
     * candidate tokens of that run in substrings (with the run's score) and the
     * aligned reference tokens in equivalence, then calls correctmatrix so that
     * the rows and columns used by the run cannot be used again. Stops when the
     * match matrix holds only zeros.
     *
     * The matchingstring argument is not read by this method.
     * NOTE(review): scores is not adjusted when correctmatrix renumbers runs, so
     * finalscore may come from a run longer than the one recorded - confirm.
     */
    private void getNonOverlappingSubstrings(int[][] match,String[][] matchingstring, float[][] scores,String input[],String reference[],Hashtable<String, Float> substrings,Hashtable<String, String> equivalence) {
        LOGGER.debug("Inside getnonoverlapping "+System.currentTimeMillis());
        int rows= input.length;
        int columns = reference.length;
        int max = matrixIsZero(match,rows,columns);

        while(max!=0)
        {
            float finalscore=0;
            String finalstring="";
            // scan from the bottom-right so a run is found at its last cell
            for(int i = rows-1;i>=0;i--)
            {
                for(int j=columns-1;j>=0;j--)
                {
                    if(match[i][j] == max)
                    {
                        String candidatetemp="";
                        String referencetemp="";
                        finalscore = scores[i][j];
                        // walk back up the diagonal to collect the tokens of the run
                        for(int count=0;count<max;count++)//breaking condition
                        {
                            if(((i-count)>=0)&&((j-count)>=0))
                            {
                                candidatetemp = input[i-count]+" "+candidatetemp;//Holds the matching string in candidate
                                referencetemp = reference[j-count]+" "+referencetemp;//Holds the equivalent strings in reference
                                //correctmatrix(match,j-count,rows,columns);
                            }
                            else
                                break;
                        }
                        correctmatrix(match,i,j,rows,columns,max);
                        if(candidatetemp.equals("")==false)
                        {
                            substrings.put(candidatetemp.trim(), finalscore);
                            equivalence.put(candidatetemp.trim(), referencetemp.trim());
                        }
                    }
                }
            }
            max = matrixIsZero(match,rows,columns);
        }
        LOGGER.debug("end of getnonoverlapping "+System.currentTimeMillis());
    }

    /*
     *
     * removes redundancy
     * it zeroes the columns and rows involved to make sure that no overlap happens again
     */
    private static void correctmatrix(int[][] match, int currentrow, int currentcolumn, int row, int totalcolumns, int length) {
        //making all columns except current column as 0
        length=length-1;
        for(int
i=0;i<row;i++)
        {
            for(int j=(currentcolumn-length);j<currentcolumn;j++)
            {
                match[i][j] = 0;
            }
        }
        // clear the current column; a run that passed through a cleared cell is
        // renumbered from the cell below-right of it onwards
        for(int i=0;i<row;i++)
        {
            if(match[i][currentcolumn]>0)
            {
                match[i][currentcolumn] =0;
                int k=i;
                for(int j=currentcolumn+1;j<totalcolumns;j++)
                {
                    k++;
                    if(((k<row)&&(j<totalcolumns))&&(match[k][j]>0))
                    {
                        match[k][j]= match[k-1][j-1]+1;
                    }else
                        break;
                }
            }
        }
        //cleaning all the rows involved
        for(int i=(currentrow-length);i<currentrow;i++)
        {
            for(int j=0;j<totalcolumns;j++)
            {
                match[i][j] =0;
            }
        }
        // same treatment for the current row
        for(int i=0;i<totalcolumns;i++)
        {
            if(match[currentrow][i]>0)
            {
                match[currentrow][i] = 0;
                int k=i;
                for(int j=currentrow+1;j<row;j++)
                {
                    k++;
                    if((k<totalcolumns)&&(match[j][k]>0))
                    {
                        match[j][k]=match[j-1][k-1]+1;
                    }
                    else
                        break;
                }
            }
        }
    }

    /*
     * returns maximum value in the current matrix else return 0, saying there is no match
     * (only the first rows x columns cells are inspected)
     */
    private static int matrixIsZero(int[][] match, int rows, int columns) {
        int max=0;
        for(int i=0;i<rows;i++)
        {
            for(int j=0;j<columns;j++)
            {
                if(max<match[i][j])
                {
                    max= match[i][j];
                }
            }
        }
        return max;
    }

    /*
     *
     * Calculates METEOR score which is used to tell the closeness of two sentences
     * @parameter candidate Represents the algorithm output
     * @parameter refernce Represents the goldstandard output
     *
     * Parentheses are removed from both strings first. The score is
     * fmean * (1 - penalty) and stays 0 when no unigram matches.
     */
    public float meteor(String candidate,String reference) {
        //LOGGER.debug("inside meteor");
        candidate = candidate.replaceAll("(\\(|\\))","");
        reference = reference.replaceAll("(\\(|\\))","");
        float matchedunigrams = (float)unigramMatcher(candidate,reference);
        float precision = (float) matchedunigrams/candidate.split(" ").length;
        float recall = (float) matchedunigrams/reference.split(" ").length;
        float alpha=(float) 1.0;
        // recall-weighted harmonic mean; NaN when nothing matches, but then it is not used
        float fmean = (10*(precision*recall))/((9*precision)+recall);
        float finalscore =(float) 0.0;
        float penalty;
        String substring;
        float chunklength= (float)maxChunks(candidate,reference,0,candidate.split(" ").length-1);
        // calculate penalty
        if(matchedunigrams>0)
        {
            penalty = (float) ((float)(0.5)* Math.pow((chunklength/matchedunigrams), 3));//chunklength shouldnt be greater
than number of unigrams // calculate meteor score finalscore = (fmean)*(1-penalty); } return finalscore; } /* * @function returns the number of unigram matches between two strings * * */ private int unigramMatcher(String candidate, String reference) { //LOGGER.debug("inside unigram matcher"); String cand[] = candidate.split(" "); String ref[] = reference.split(" "); int matches=0; for(int i=0; i<cand.length;i++) { for(int j=0;j<ref.length;j++) { if(cand[i].equals(ref[j])==true) { ref[j]=""; matches+=1; break; } } } return matches; } /* * Calculate the number of matching chunks between the input and reference strings * @parameter candidate represents the candidate string * @parameter reference represents the reference string * * @Return the total number of chunks in a candidate string that matches the reference string */ private int maxChunks(String candidate,String reference,int start,int end) { //LOGGER.debug("inside max chunks"); String tokens[] = candidate.split(" "); String bigchunk=""; String substring; int chunklength=0; int chunkstart=-1; int chunkend=-1; // Find the maximum sized chunk for(int i=start;i<end+1;i++) { for(int j=end;j>=i;j--) { substring = substring(tokens,i,j); if((reference.matches("(^|.*?\\s+)("+substring+")(\\s+.*|$)")==true)&&(substring.split(" ").length)>chunklength) { chunklength = substring.split(" ").length; bigchunk=substring; chunkstart = i; chunkend = j; } } } if(chunklength!=0) { if(chunkstart>start && chunkend<end) { return 1+maxChunks(candidate,reference,start,chunkstart-1)+maxChunks(candidate,reference,chunkend+1,end); } else if((chunkstart==start)&&(chunkend!=end)) { return 1+maxChunks(candidate,reference,chunkend+1,end); } else if((chunkstart!=start)&&(chunkend==end)) { return 1+maxChunks(candidate,reference,start,chunkstart-1); } else { return 1; } } else { //substring = substring(tokens,start,end); //LOGGER.debug(substring); return 0 ; } } /* * @param type can be "entity" or "quality" * @param substringmatches holds the 
substring and the score of that substring * @param equivalence holds the matching substrings in compared strings */ /*private void getMatchingSubstrings(String candidate,String reference,int start, int end, Hashtable<String,Float> substrings,String type,ELKReasoner elk,Hashtable<String,String> equivalence) { //LOGGER.debug("inside getmatching substring"); //LOGGER.debug(candidate+" "+reference); if (candidate == null || reference == null || candidate.length() == 0 || reference.length() == 0) { return; } String c[] = candidate.split(" "); String r[] = reference.split(" "); String substring="",matchingrefstring =""; int chunklength = 0,maxi=-1,maxj=-1; //int clength = c.length>end?end:c.length; int rlength = r.length; int[][] matchtable = new int[end+1][rlength]; float[][] scoretable = new float[end+1][rlength]; float finalscore=(float) 0.0; String[][] matches = new String[end+1][rlength]; for (int i = start; i <= end; i++) { if(c[i].matches(".*(bspo|BSPO|UBERON|uberon|BFO|bfo|RO|ro).*")==true) { if(c[i].matches(".*(bspo|BSPO).*")==false) elk = this.elkentity; else elk = this.elkspatial; } else { elk = this.elkquality; } for (int j = 0; j < rlength; j++) { float score=0; //direct equal check or partial match a score should be assigned if(c[i].equals(r[j])) { score =1; } else if(elk.isSubClassOf("http://purl.obolibrary.org/obo/"+c[i].toUpperCase(),"http://purl.obolibrary.org/obo/"+r[j].toUpperCase())==true || elk.isSubClassOf("http://purl.obolibrary.org/obo/"+r[j].toUpperCase(),"http://purl.obolibrary.org/obo/"+c[i].toUpperCase())==true) { score=(float) 0.75; } else if((c[i].matches(".*(pato|PATO).*")==false)&&((elk.isPartOf("http://purl.obolibrary.org/obo/"+c[i].toUpperCase(),"http://purl.obolibrary.org/obo/"+r[j].toUpperCase())==true || elk.isPartOf("http://purl.obolibrary.org/obo/"+r[j].toUpperCase(),"http://purl.obolibrary.org/obo/"+c[i].toUpperCase())==true))) { score=(float) 0.75; } if (score>0) { if (i == 0 || j == 0) { matchtable[i][j] = 1; matches[i][j] = 
c[i]; scoretable[i][j] = score; } else { matchtable[i][j] = matchtable[i - 1][j - 1] + 1; matches[i][j] = ((matches[i-1][j-1]!=null?matches[i-1][j-1]:"")+" "+c[i]).trim(); scoretable[i][j] = scoretable[i-1][j-1]+score; } if (matchtable[i][j] > chunklength) { chunklength = matchtable[i][j]; substring = matches[i][j]; finalscore = scoretable[i][j]; maxi=i; maxj=j; } } } } if(chunklength!=0) { substrings.put(substring, finalscore); for(int i=maxj;i>maxj-chunklength;i--) matchingrefstring=r[i]+" "+matchingrefstring; matchingrefstring = matchingrefstring.trim(); equivalence.put(substring, matchingrefstring);//holds the equivalence candidate and reference substrings(to mainly handle partial matches) if(maxi-chunklength>=start && maxi<end) { getMatchingSubstrings(candidate,reference,start,maxi-chunklength,substrings,type,elk,equivalence); getMatchingSubstrings(candidate,reference,maxi+1,end,substrings,type,elk,equivalence); } else if((maxi-chunklength+1==start)&&(maxi!=end)) { getMatchingSubstrings(candidate,reference,maxi+1,end,substrings,type,elk,equivalence); } else if(((maxi-chunklength)+1!=start)&&(maxi==end)) { getMatchingSubstrings(candidate,reference,start,maxi-chunklength,substrings,type,elk,equivalence); } else { //LOGGER.debug("--end of function-----"); return; } } else { substrings.put(substring,finalscore); // LOGGER.debug("--end of function-----"); return; } }*/ /* * creates a substring with specified start and ending token values */ private static String substring(String[] tokens, int i, int j) { String substring=""; for( ;i<=j;i++) { substring+=tokens[i]+" "; } return substring.trim(); } /* * It takes the list of substrings that are common to candidate and reference strings. 
* Then it creates a character sequence and using LCS logic, it identifies and remove the LCS * It also calculates the closeness of two strings whose substring matching is given * * @parameter candidate answer */ private static float replaceSubString(String candidate, String reference, Hashtable<String, Float> substrings, Hashtable<String,String> substringmap) { //LOGGER.debug("inside replace substring"); char alphabets = 'a'; int i=0; float finalscore= 0; Hashtable<String,Float> chunkscore = new Hashtable<String,Float>(); Set<String> keys = substrings.keySet(); String candidatecopy = candidate; String referencecopy = reference; //replaces each matching chunk with an alphabet followed by @@(delimiter) in both reference and candidate strings keys = sort(keys);// The longest matching substring should be replaced first for(String key:keys) { if(key!="") { String replace = (String)((char)(alphabets+i)+"@@"); candidate = candidate.replace(key.trim(), replace.trim()); if(substringmap.get(key).equals(key))//to map the partial and exact matchings { reference = reference.replace(key.trim(), replace); } else { reference = reference.replace(substringmap.get(key).trim(), replace); } chunkscore.put(replace.replaceAll("@@", ""), substrings.get(key));//stores the scores of each of the chunks i++; } } String tokens[] = candidate.split(" "); //replacing unmatched tokens with alphabets @@ is used to distinguish alphabets from normal text for(int j=0;j<tokens.length;j++) { if((tokens[j].contains("@@")==false)&&(tokens[j].trim().equals("")==false)) { candidate=candidate.replace(tokens[j], (String)((char)(alphabets+i)+"@@")); reference=reference.replace(tokens[j], (String)((char)(alphabets+i)+"@@")); i++; } } tokens = reference.split(" "); //replacing unmatched tokens with alphabets in reference string for(int j=0;j<tokens.length;j++) { if(tokens[j].contains("@@")==false &&(tokens[j].trim().equals("")==false)) { reference=reference.replace(tokens[j], (String)((char)(alphabets+i)+"@@")); 
i++; } } //formatting candidate and reference strings to find LCS candidate = candidate.replaceAll("@@", "");//a reference = reference.replaceAll("@@", "");//a b c candidate = candidate.replaceAll(" ", ""); reference = reference.replaceAll(" ", ""); //find LCS and remove it ArrayList<String> position = lcs(candidate,reference,chunkscore);//holds the position of LCS in both candidate and reference strings tokens = position.get(0).split(" "); char can[] = candidate.toCharArray(); for(String index:tokens) { if(index!="") { finalscore+=chunkscore.get(can[Integer.parseInt(index)]+""); can[Integer.parseInt(index)] = '\0'; } } candidate = ""; for(char c:can)//rebuilding candidate string { if(c!='\0') candidate+=c; } tokens = position.get(1).split(" "); char ref[] = reference.toCharArray(); for(String index:tokens) { if(index!="") { ref[Integer.parseInt(index)] = '\0'; } } reference = ""; for(char r:ref)//rebuilding reference string { if(r!='\0') reference+=r; } //the final score of the LCS is added with other matching chunks but those not in LCS. 
Since order is mismatched a penalty of halving is applied here for(char c:candidate.toCharArray())//<=== { if(reference.contains(c+"")==true) { finalscore+=((chunkscore.get(c+""))/2); reference.replaceFirst(c+"", ""); } } return finalscore; //LOGGER.debug(candidate); //LOGGER.debug(reference); } //Sorts the given set according to length of the key private static Set<String> sort(Set<String> keys) { //LOGGER.debug("Inside sort"); LinkedHashSet<String> temp1 = new LinkedHashSet<String>(); String temp2[] = new String[keys.size()]; String temp; int i=0; for(String key:keys) { temp2[i++] = key; } for(int j=0;j<i;j++) { for(int k=j+1;k<i;k++) { if(temp2[j].split(" ").length<temp2[k].split(" ").length) { temp=temp2[j]; temp2[j]=temp2[k]; temp2[k]=temp; } } temp1.add(temp2[j]); } return temp1; } /* * * returns the longest common subsequence * If there are more than one LCS, then the one with highest score is chosen * * */ public static ArrayList<String> lcs(String a, String b, Hashtable<String, Float> chunkscore) { //LOGGER.debug("Inside LCS"); int[][] lengths = new int[a.length()+1][b.length()+1]; String candidateposition =""; String referenceposition =""; ArrayList<String> position = new ArrayList<String>(); int max=0; // row 0 and column 0 are initialized to 0 already for (int i = 0; i < a.length(); i++) { for (int j = 0; j < b.length(); j++) { if (a.charAt(i) == b.charAt(j)) { lengths[i+1][j+1] = lengths[i][j] + 1; } else { lengths[i+1][j+1] = Math.max(lengths[i+1][j], lengths[i][j+1]); } if(lengths[i+1][j+1]>max) max = lengths[i+1][j+1]; } } //Used to identify all the LCS positions String pos[] = new String[a.length()+1]; int k=0; for (int i = 0; i < a.length()+1; i++) { for (int j = 0; j < b.length()+1; j++) { if(lengths[i][j]==max) { pos[k++] = ""+i+","+j; break; } } } // read the substrings out from the matrix StringBuffer sb; StringBuffer finalsb = null; String finalcandidateposition=""; String finalreferenceposition=""; for(int p=0;p<k;p++) { sb = new 
StringBuffer();
            candidateposition="";
            referenceposition ="";
            // backtrack from the start cell pos[p] towards row/column 0 of the LCS table
            for (int x = Integer.parseInt(pos[p].split(",")[0]), y = Integer.parseInt(pos[p].split(",")[1]); x != 0 && y != 0; )
            {
                if (lengths[x][y] == lengths[x-1][y])
                {
                    x--;
                }
                else if (lengths[x][y] == lengths[x][y-1])
                {
                    y--;
                }
                else
                {
                    // a.charAt(x-1) is part of the subsequence: remember its index in both strings
                    assert a.charAt(x-1) == b.charAt(y-1);
                    sb.append(a.charAt(x-1));
                    candidateposition=(x-1)+" "+candidateposition;
                    referenceposition=(y-1)+" "+referenceposition;
                    x--;
                    y--;
                }
                if(finalsb==null)
                {
                    finalsb=sb;
                    finalcandidateposition=candidateposition;
                    finalreferenceposition = referenceposition;
                }
                else
                {
                    finalsb = resolveBasedOnScore(finalsb,sb,chunkscore);
                    // StringBuffer does not override equals, so this is an identity
                    // test: true when resolveBasedOnScore handed back sb itself
                    if(finalsb.equals(sb)==true)
                    {
                        finalcandidateposition=candidateposition;
                        finalreferenceposition = referenceposition;
                    }
                }
            }
        }
        // element 0: space-separated LCS indexes in a, element 1: the same for b
        position.add(finalcandidateposition.trim());
        position.add(finalreferenceposition.trim());
        //LOGGER.debug(sb.reverse().toString());
        return position;
    }

    /*
     * Sums the chunk scores of the characters of both buffers and returns
     * finalsb only when its sum is strictly greater; otherwise (including a
     * tie) sb is returned. Every character must have an entry in chunkscore.
     */
    private static StringBuffer resolveBasedOnScore(StringBuffer finalsb, StringBuffer sb, Hashtable<String, Float> chunkscore) {
        char temp[]=finalsb.toString().toCharArray();
        float score1=0,score2=0;
        for(int i=0;i<temp.length;i++)
        {
            score1+=chunkscore.get(temp[i]+"");
        }
        temp=sb.toString().toCharArray();
        for(int i=0;i<temp.length;i++)
        {
            score2+=chunkscore.get(temp[i]+"");
        }
        if(score1>score2)
            return finalsb;
        else
            return sb;
    }

    /*
     * Normalises an expression for comparison: removes the text "and ",
     * "some " and "all ", removes parentheses and replaces ':' with '_'.
     * NOTE(review): the first pattern is not anchored to word starts, so it
     * also removes these letters at the end of a longer word - confirm that
     * only id expressions are passed in.
     */
    private static String format(String token) {
        token = token.replaceAll("(and |some |all )", "");
        token = token.replaceAll("(\\(|\\))", "");
        token = token.replace(":", "_");
        return token;
    }

    /*
     * Compares the cleaned ids of original with those of modified and returns
     * the original ids that have no unused equal id in modified, mapped to
     * their index in the cleaned original array.
     */
    private static Hashtable<String, Integer> nonexistentid(String original, String modified) {
        String originals[] = extractids(original);
        String modifieds[] = extractids(modified);
        originals = clean(originals);
        modifieds = clean(modifieds);
        Hashtable<String,Integer> nonexists = new Hashtable<String,Integer>();
        for(int j=0;j<originals.length;j++)
        {
            int i=0;
            for(i=0;i<modifieds.length;i++)
            {
                if(originals[j].equals(modifieds[i])==true)
                {
                    // blank the match so it cannot be paired a second time
                    modifieds[i]="";
                    break;
                }
            }
if(i==modifieds.length)
            {
                // the inner loop ran to the end: no equal id was found
                nonexists.put(originals[j], j);
            }
        }
        return nonexists;
    }

    /*
     * Keeps the entries that look like a numeric ontology id (upper-case
     * prefix, '_' or ':', digits) and are not one of the relation ids.
     * Returns a one-element array holding the empty string when nothing is kept.
     */
    private static String[] clean(String[] idarray) {
        String id ="";
        for(String t:idarray)
        {
            if(t.matches("[A-Z]+[_:][0-9]+")==true)
            {
                if(t.matches("("+relationid+")")==false)
                    id+=t+" ";
            }
        }
        id=id.trim();
        return id.split(" ");
    }

    /*
     * Removes parentheses from label, splits it on the relation names and
     * prints and returns the trimmed piece at position index among the pieces
     * that are not a single space.
     * NOTE(review): relation is used as an unanchored regex, so short
     * alternatives such as "and" or "not" also split inside longer words, and
     * an index beyond the kept pieces hits a null entry - confirm callers.
     */
    private static String extractLabel(String label,int index) {
        /*
        label= label.replaceAll(relation, "");
        label= label.replaceAll("(\\(|\\))", "");
        label = label.replaceAll("(\\s)+", " ");
        System.out.println(label.split("\\s")[index]);
        return "";*/
        String labels[]=null,final_labels[] = null;
        int count=0;
        label= label.replaceAll("(\\(|\\))", "");
        labels = label.split(relation);
        final_labels = new String[labels.length];
        for(String temp:labels)
        {
            if(temp.equals(" ")==false)
            {
                final_labels[count++] = temp;
            }
        }
        System.out.println(final_labels[index].trim());
        return final_labels[index].trim();
    }

    /**
     * Runs the evaluation for every ordered pair of the six hard-coded result
     * tables (each table is also compared with itself), using the database
     * named by the "database.name" application property.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        //String database = "biocreative2012";
        //String candidate = "the cat sat on the mat";
        // String reference = "the cat was sat on the mat";
        //LOGGER.debug(meteor(candidate,reference,1,1));
        //String resulttable = ApplicationUtilities.getProperty("table.output");
        //long startTime = System.currentTimeMillis();
        //String resulttable = "test_equivalent";
        //String goldstandard = "goldstandard";
        //EQPerformanceEvaluation pe = new EQPerformanceEvaluation(database, resulttable, goldstandard,"evaluationrecords", "test_is_equivalent");
        //pe.evaluate();

        //intercurator comparison
        /*String database = "charaparsereval2013";
        String[] resulttable = new String[]{"naive_38484", "naive_38484", "naive_40674", "knowledge_40716", "knowledge_40716","knowledge_40717", "naive_38484", "naive_40674", "naive_40676"};
        String[] goldstandard = new String[]{"naive_40674","naive_40676", "naive_40676", "knowledge_40717", "knowledge_40718","knowledge_40718", "knowledge_40716", "knowledge_40717", "knowledge_40718"};
        String[] setting = new
String[]{"c38484_c40674", "c38484_c40676","c40674_c40676", "c40716_c40717","c40716_c40718","c40717_c40718", "c38484_c40716", "c40674_c40717", "c40676_c40718" };*/
        String database =ApplicationUtilities.getProperty("database.name");
        String [] ids = new String[]{"naive_38484", "naive_40674", "naive_40676", "knowledge_40717", "knowledge_40718", "knowledge_40716"};
        for(int i = 0; i<6; i++){
            for(int j = 0; j < 6; j++){
                // the printed setting name lacks the "_sym" suffix passed to the constructor below
                System.out.println("Evaluation with "+database + "," + ids[i]+ "," + ids[j]+ "," +"evaluationrecords" + "," + ids[i]+"_"+ids[j]);
                EQPerformanceEvaluation pe = new EQPerformanceEvaluation(database, ids[i], ids[j],"evaluationrecords", ids[i]+"_"+ids[j]+"_sym");
                pe.evaluate();
            }
        }
        //EQPerformanceEvaluation pe = new EQPerformanceEvaluation("charaparsereval2013", "naive_38484", "knowledge_40717","evaluationrecords", "debug");
        //pe.evaluate();
        //long stopTime = System.currentTimeMillis();
        //System.out.println("Elapsed time was " + (stopTime - startTime)/60000f + " minutes.");

        //maxChunks("28135","13528635",0,3);
//      String candidate ="UBERON:0002743 and (OBO_REL_part_of some (UBERON:0004741 and (OBO_REL_part_of some UBERON:0011683)))";
//      String reference = "UBERON:0007831 and (OBO_REL_part_of some (UBERON:0011648 and (OBO_REL_part_of some UBERON:0002743)))";
//      LinkedHashMap<String,Float> temp = new LinkedHashMap<String,Float>();
//      Hashtable<String,String> equivalence = new Hashtable<String,String>();
//
//      ELKReasoner elk = null;
//      try {
////            elk = new ELKReasoner(new File(ApplicationUtilities.getProperty("ontology.dir")+System.getProperty("file.separator")+"ext.owl"));
//      } catch (OWLOntologyCreationException e) {
//          // TODO Auto-generated catch block
//          e.printStackTrace();
//      }
        //start should be zero and end should be length of candidate string -1
        //getMatchingSubstrings("2 8 1 3 5 2 8 7","1 3 5 1 3 8 5 6 3 5 2",0,7,temp,"exact",elk);
//      candidate = format(candidate);
//      reference = format(reference);
//      getMatchingSubstrings(candidate,reference,0,candidate.split(" ").length-1,temp,"entity",elk,equivalence);
//      LOGGER.debug("final score====="+replaceSubString(candidate,reference,temp,equivalence));
//      candidate= candidate.replace(" ", "");
//      reference = reference.replace(" ", "");
    }
}